Loading Packages
# Attach every package the report uses, installing any that are missing.
# require() reports a missing package by returning FALSE (with a warning)
# rather than raising an error, so wrapping it in try() and testing for
# "try-error" never triggers an install. Check availability explicitly,
# install when needed, then attach with library() so a failure is loud.
# The attach order is kept as in the original, since it determines masking.
for (pkg in c("gdata", "ggplot2", "plotly", "dplyr", "DT", "reshape")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
Reading the Data
# Load the raw inputs from the local ./data directory.
# The two CSV files hold the training and test sets; the first sheet of the
# Excel workbook holds the data dictionary.
trainData <- utils::read.csv(file = "./data/cs-training.csv")
testData <- utils::read.csv(file = "./data/cs-test.csv")
dataDict <- gdata::read.xls(xls = "./data/Data Dictionary.xls", sheet = 1)
Data Dictionary
# Render the data dictionary as an interactive table.
dataDict %>%
  DT::datatable()
Train Data Table
# Interactive, scrollable view of the training data.
# The `fixedColumns` option is only honoured when the FixedColumns extension
# is loaded, so it is enabled explicitly here; without it the request to
# freeze the left-most column is silently ignored.
# NOTE(review): DataTables documents `scrollY` as a CSS height (e.g. "400px");
# confirm that TRUE gives the intended vertical scrolling.
DT::datatable(
  data = trainData,
  extensions = "FixedColumns",
  options = list(
    scrollX = TRUE,
    scrollY = TRUE,
    scrollCollapse = TRUE,
    autoWidth = TRUE,
    fixedColumns = list(leftColumns = 1)
  )
)
Data Summary
Summary of All Data
# Per-column summary statistics for the full training set.
trainData %>%
  summary()
## X SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
## Min. : 1 Min. :0.00000 Min. : 0.00
## 1st Qu.: 37501 1st Qu.:0.00000 1st Qu.: 0.03
## Median : 75000 Median :0.00000 Median : 0.15
## Mean : 75000 Mean :0.06684 Mean : 6.05
## 3rd Qu.:112500 3rd Qu.:0.00000 3rd Qu.: 0.56
## Max. :150000 Max. :1.00000 Max. :50708.00
##
## age NumberOfTime30.59DaysPastDueNotWorse DebtRatio
## Min. : 0.0 Min. : 0.000 Min. : 0.0
## 1st Qu.: 41.0 1st Qu.: 0.000 1st Qu.: 0.2
## Median : 52.0 Median : 0.000 Median : 0.4
## Mean : 52.3 Mean : 0.421 Mean : 353.0
## 3rd Qu.: 63.0 3rd Qu.: 0.000 3rd Qu.: 0.9
## Max. :109.0 Max. :98.000 Max. :329664.0
##
## MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## Min. : 0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3400 1st Qu.: 5.000 1st Qu.: 0.000
## Median : 5400 Median : 8.000 Median : 0.000
## Mean : 6670 Mean : 8.453 Mean : 0.266
## 3rd Qu.: 8249 3rd Qu.:11.000 3rd Qu.: 0.000
## Max. :3008750 Max. :58.000 Max. :98.000
## NA's :29731
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 1.000 Median : 0.0000
## Mean : 1.018 Mean : 0.2404
## 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. :54.000 Max. :98.0000
##
## NumberOfDependents
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 0.757
## 3rd Qu.: 1.000
## Max. :20.000
## NA's :3924
Summary of Positive Cases (SeriousDlqin2yrs == 1)
# Summary statistics restricted to rows where SeriousDlqin2yrs equals 1.
trainData %>%
  dplyr::filter(SeriousDlqin2yrs == 1) %>%
  summary()
## X SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
## Min. : 1 Min. :1 Min. : 0.000
## 1st Qu.: 38257 1st Qu.:1 1st Qu.: 0.398
## Median : 75283 Median :1 Median : 0.839
## Mean : 75454 Mean :1 Mean : 4.367
## 3rd Qu.:112962 3rd Qu.:1 3rd Qu.: 1.000
## Max. :149980 Max. :1 Max. :8328.000
##
## age NumberOfTime30.59DaysPastDueNotWorse DebtRatio
## Min. : 21.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 36.00 1st Qu.: 0.000 1st Qu.: 0.19
## Median : 45.00 Median : 0.000 Median : 0.43
## Mean : 45.93 Mean : 2.388 Mean : 295.12
## 3rd Qu.: 54.00 3rd Qu.: 2.000 3rd Qu.: 0.89
## Max. :101.00 Max. :98.000 Max. :38793.00
##
## MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## Min. : 0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2963 1st Qu.: 4.000 1st Qu.: 0.000
## Median : 4500 Median : 7.000 Median : 0.000
## Mean : 5631 Mean : 7.882 Mean : 2.091
## 3rd Qu.: 6800 3rd Qu.:11.000 3rd Qu.: 1.000
## Max. :250000 Max. :57.000 Max. :98.000
## NA's :1669
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 1.0000 Median : 0.000
## Mean : 0.9885 Mean : 1.828
## 3rd Qu.: 2.0000 3rd Qu.: 1.000
## Max. :29.0000 Max. :98.000
##
## NumberOfDependents
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.9482
## 3rd Qu.:2.0000
## Max. :8.0000
## NA's :179
Summary of Negative Cases (SeriousDlqin2yrs != 1)
# Summary statistics restricted to rows where SeriousDlqin2yrs is not 1.
trainData %>%
  dplyr::filter(SeriousDlqin2yrs != 1) %>%
  summary()
## X SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
## Min. : 2 Min. :0 Min. : 0.00
## 1st Qu.: 37453 1st Qu.:0 1st Qu.: 0.03
## Median : 74982 Median :0 Median : 0.13
## Mean : 74968 Mean :0 Mean : 6.17
## 3rd Qu.:112465 3rd Qu.:0 3rd Qu.: 0.49
## Max. :150000 Max. :0 Max. :50708.00
##
## age NumberOfTime30.59DaysPastDueNotWorse DebtRatio
## Min. : 0.00 Min. : 0.0000 Min. : 0.0
## 1st Qu.: 42.00 1st Qu.: 0.0000 1st Qu.: 0.2
## Median : 52.00 Median : 0.0000 Median : 0.4
## Mean : 52.75 Mean : 0.2801 Mean : 357.2
## 3rd Qu.: 63.00 3rd Qu.: 0.0000 3rd Qu.: 0.9
## Max. :109.00 Max. :98.0000 Max. :329664.0
##
## MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## Min. : 0 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 3461 1st Qu.: 5.000 1st Qu.: 0.0000
## Median : 5466 Median : 8.000 Median : 0.0000
## Mean : 6748 Mean : 8.494 Mean : 0.1352
## 3rd Qu.: 8333 3rd Qu.:11.000 3rd Qu.: 0.0000
## Max. :3008750 Max. :58.000 Max. :98.0000
## NA's :28062
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## Min. : 0.00 Min. : 0.0000
## 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 1.00 Median : 0.0000
## Mean : 1.02 Mean : 0.1267
## 3rd Qu.: 2.00 3rd Qu.: 0.0000
## Max. :54.00 Max. :98.0000
##
## NumberOfDependents
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 0.743
## 3rd Qu.: 1.000
## Max. :20.000
## NA's :3745
Exploratory Data Analysis
Boxplot In Scale
# Grouped boxplot of every predictor, scaled to a common range.
#
# Take a 5% random sample of the training rows (unseeded, so the sample
# differs between runs), drop column X (it looks like a row index; confirm),
# and melt to long format with one row per (SeriousDlqin2yrs, variable, value).
auxData <- trainData[sample(nrow(trainData), round(0.05 * nrow(trainData))), ] %>%
  dplyr::select(-X) %>%
  reshape::melt(id = "SeriousDlqin2yrs")

# Scale each variable by its own maximum so all boxes share one axis.
# na.rm = TRUE is required: MonthlyIncome and NumberOfDependents contain NAs,
# and a plain max() would return NA for them, turning every scaled value of
# those variables into NA and dropping their boxes from the plot.
# A grouped mutate attaches the per-variable maximum to each row directly,
# which replaces the earlier summarise-then-join on the same table.
auxData <- auxData %>%
  dplyr::group_by(variable) %>%
  dplyr::mutate(
    Max = max(value, na.rm = TRUE),
    ScaledVal = value / Max
  ) %>%
  dplyr::ungroup() %>%
  as.data.frame()

# One box per variable, split by outcome class. add_boxplot() already sets the
# trace type to "box", so it is not passed again.
plotly::plot_ly(
  data = auxData,
  x = ~variable,
  y = ~ScaledVal,
  color = ~as.factor(SeriousDlqin2yrs)
) %>%
  plotly::add_boxplot(boxpoints = "suspectedoutliers") %>%
  plotly::layout(boxmode = "group")